############################################################ ########### #
#
# Project: Stability and change in the opinion-policy relationship
# 
# This cleans rent data and election data and combines them
# 
# 2022.09.30. 
############################################################ ########### #


library(rio)
library(tidyverse)

rent1 <- import("A-original-data/Apartment_List_Rent_Data_-_State_2020-6.csv") %>% 
        filter(Bedroom_Size == "1br") %>% 
        select(-Location_Type, -Bedroom_Size) %>% 
        rename(state = Location) %>% 
        pivot_longer(contains("Price"), names_to = "year_month", values_to = "rent") %>% 
        mutate(year_month = gsub("Price_", "", year_month)) %>% 
        separate(year_month, into = c("year", "month"), sep = "_", remove = F) %>% 
        filter(year < 2017 | state == "Vermont") %>%
        glimpse


rent2 <- import("A-original-data/Apartment_List_Rent_Estimates_State_2021_2.csv") %>% 
        filter(Bedroom_Size == "1br") %>% 
        select(-Population, -FIPS_Code, -Bedroom_Size) %>% 
        rename(state = State_Name) %>% 
        pivot_longer(contains("20"), names_to = "year_month", values_to = "rent") %>% 
        separate(year_month, into = c("year", "month"), sep = "_", remove = F) %>% 
        glimpse

vermont <- data.frame(year = as.character(c(rep(2020, 6), 2021, 2021)), 
                      month = as.character(c(7:12, 1:2)), 
                      rent = NA, 
                      state = "Vermont") %>% 
        mutate(year_month = paste(year, month, sep = "_"))

rent <- bind_rows(rent1, rent2, vermont) %>% 
        # bind_rows(vermont) %>% glimpse
        mutate(year = as.double(year), 
               month = as.double(month),
               state = tolower(state)) %>% 
        arrange(state, year, month) %>% 
        glimpse

missing <- rent %>% group_by(state) %>% 
        summarise(na = sum(is.na(rent))) %>% 
        arrange(desc(na))


# we have missing values in Vermont! 
#       let's use the a simple linear interpolation function to impute
#       seems to work pretty well
# plot(rent$rent[rent$state=="Vermont"])
# plot(timetk::ts_impute_vec(rent$rent[rent$state=="Vermont"], period = 1,lambda ="auto"))

rent$rent[rent$state=="vermont"] <- timetk::ts_impute_vec(rent$rent[rent$state=="vermont"], period = 1,lambda ="auto")

frent <- rent %>% filter(month == 1)

# Presidential vote -------------------------------------------------------

pr_raw <- import("A-original-data/1976-2020-president.csv")

candidates <- c("OBAMA, BARACK H.", "ROMNEY, MITT", "MITT, ROMNEY", 
                "CLINTON, HILLARY", "TRUMP, DONALD J.", 
                "TRUMP, DONALD J.", "BIDEN, JOSEPH R. JR")
        
# filter(pr_raw, year >= 2012 & state == "MINNESOTA")
        
pr <- pr_raw %>% 
        # subset to post 2012 elections
        filter(year >= 2012 & 
                       # the two main candidates
                       candidate %in% candidates &
                       # throw out write in votes, except in DC
                       (state == "DISTRICT OF COLUMBIA" | writein == FALSE)) %>% 
        # romney has a strange name in one place, replace with original
        mutate(candidate = ifelse(candidate == "MITT, ROMNEY", "ROMNEY, MITT", candidate)) %>% 
        # in some cases (ny state) candidates run under multiple banners
        #       sum up votes across all
        group_by(candidate, year, state) %>% 
        summarise(votes = sum(candidatevotes), 
                  party = unique(party_detailed)) %>% 
        # and then filter out weird parties
        filter(grepl("DEMOCRAT", party) | grepl("REPUBLICAN", party))  %>%
        ungroup() %>%
        # next up we calculate two party voteshare
        # by grouping for state years
        group_by(state, year) %>%
        # calculating votes as share of twoparty total
        mutate(twoshare = votes / sum(votes), 
               year = year + 1,
               state = tolower(state)) %>% # change year to inauguration
        # keep only the democrats
        filter(grepl("DEMOCRAT", party)) %>%
        select(state, year, demshare = twoshare) %>%
        ungroup() %>% 
        glimpse()

pr %>% 
        filter(year == 2013) %>% 
        saveRDS("B-analysis-data/presvote2012.rds")

ma <- pr %>% 
        expand(state, year = 2013:2021) %>% 
        left_join(pr) %>%
        group_by(state) %>% 
        ## use a simple linear intraploation to calculate non-election years
        # mutate( #fancy method
        #         demshare1 = timetk::ts_impute_vec(demshare, period = 1, lambda = NULL),
        #        # manual method
        #        demshare2 = case_when(year == 2014 ~ 0.75*demshare[year == 2013] +
        #                                      0.25*demshare[year == 2017],
        #                              year == 2015 ~ 0.5*demshare[year == 2013] +
        #                                      0.5*demshare[year == 2017],
        #                              year == 2016 ~ 0.25*demshare[year == 2013] +
        #                                      0.75*demshare[year == 2017],
        #                              year == 2018 ~ 0.75*demshare[year == 2017] +
        #                                      0.25*demshare[year == 2021],
        #                              year == 2019 ~ 0.5*demshare[year == 2017] +
        #                                      0.5*demshare[year == 2021],
        #                              year == 2020 ~ 0.25*demshare[year == 2017] +
        #                                      0.75*demshare[year == 2021],
        #                              TRUE ~ demshare)
        #        ) %>%
        ## use a fancy formula to intrapolate
        mutate(demshare = timetk::ts_impute_vec(demshare, period = 1, lambda = NULL)) %>%
        glimpse

# when running both fancy and manual, double check they are identical
# stopifnot(cor(ma$demshare1, ma$demshare2) == 1)


rentvote <- left_join(ma, frent)
saveRDS(rentvote, "B-analysis-data/final_rentvote.rds")
